A care management organisation called WeCare wants to identify, among its diabetic patients, the ones at high risk of being re-admitted to the hospital. It wishes to intervene by providing incentives that will help these patients improve their health. As the star analyst of this organisation, your job is to identify high-risk diabetic patients through risk stratification. This will help the payer decide on the right intervention programs for these patients.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format
import warnings
warnings.filterwarnings('ignore')
# Util function: visual separator between notebook outputs
def print_ln():
    """Print an 80-dash separator followed by a blank line."""
    separator = '-' * 80
    print(separator, '\n')
# Loading in the dataset
# NOTE(review): reads from '../resources/' while the cleansed output later
# writes to '../_resources/' (leading underscore) -- confirm which is intended.
diabetic_patient_data_orig = pd.read_csv('../resources/diabetic_data.csv')
# Create a working copy so the raw frame stays untouched for later reference
diabetic_patient_data = diabetic_patient_data_orig.copy()
# Exploring the shape and info about the dataset
print('Dataframe Shape: ', diabetic_patient_data.shape)
print_ln()
print("Dataframe Info: \n")
diabetic_patient_data.info()
print_ln()
# Inspecting the head of the dataset (last expression renders in the notebook)
diabetic_patient_data.head(5)
# The raw export uses '?' as its missing-value marker -- convert to NaN so
# pandas treats those cells as missing.
diabetic_patient_data = diabetic_patient_data.replace('?', np.nan)
# FIX: the original used `.replace({...}, inplace=True)` on a column slice,
# which is chained assignment -- it raises FutureWarnings and silently stops
# mutating the frame under pandas copy-on-write. Assign the result back.
diabetic_patient_data['medical_specialty'] = diabetic_patient_data['medical_specialty'].fillna('Unknown')
# `encounter_id` and `patient_nbr` are row identifiers with no predictive
# value, and the raw `diag_*` ICD codes don't add direct value to this
# analysis -- drop them all in one call.
diabetic_patient_data = diabetic_patient_data.drop(
    columns=['encounter_id', 'patient_nbr', 'diag_1', 'diag_2', 'diag_3'])
# Analyse the missing values: per-column percentage of NaNs
missing_pct = diabetic_patient_data.isnull().mean() * 100
columns_with_missing_data = missing_pct.round(2)
# Show only columns missing more than 30% of their values
columns_with_missing_data[columns_with_missing_data > 30].plot(kind='bar')
plt.show()
# `weight` is almost completely empty, so it carries no signal -- drop it
diabetic_patient_data = diabetic_patient_data.drop(columns=['weight'])
# `payer_code` is redundant for our purpose -- drop it as well
diabetic_patient_data = diabetic_patient_data.drop(columns=['payer_code'])
# Binarise the target: both '<30'- and '>30'-day readmissions count as 'YES'
diabetic_patient_data['readmitted'] = diabetic_patient_data['readmitted'].replace(['>30', '<30'], 'YES')
diabetic_patient_data = diabetic_patient_data.drop_duplicates()
readmitted_df = diabetic_patient_data["readmitted"].value_counts()
readmitted_df
# FIX: the original indexed `readmitted_df[1]` / `readmitted_df[0]` -- bare
# integer keys on a string-indexed Series relied on the deprecated positional
# fallback (removed in pandas 2.x) AND on 'NO' happening to be the majority
# class. Index by label instead.
readmitted_count = readmitted_df['YES']
non_readmitted_count = readmitted_df['NO']
diabetic_patient_data_rate = readmitted_count / (readmitted_count + non_readmitted_count)
diabetic_patient_data_rate
print("Total readmission Count = {}".format(readmitted_count))
print("Total Non-readmission Count = {}".format(non_readmitted_count))
print("Readmission Rate = {:.2f}%".format(diabetic_patient_data_rate * 100))
print_ln()
def type_features(data):
    """Split a frame's columns by dtype and return (categorical, numerical).

    Object-dtype columns are treated as categorical; everything else as
    numerical. Both lists are printed (with a separator) before returning.
    """
    cat_cols = data.select_dtypes(include=["object"]).columns
    num_cols = data.select_dtypes(exclude=["object"]).columns
    for label, cols in (("categorical_features :", cat_cols),
                        ("numerical_features:", num_cols)):
        print(label, cols)
        print_ln()
    return cat_cols, num_cols
# Derive initial feature lists from dtypes
diabetic_patient_data_cat_features, diabetic_patient_data_num_features = type_features(diabetic_patient_data)
# Override the dtype-derived numeric list with the columns that are truly
# numeric measurements (the *_id columns are integers but really category codes)
diabetic_patient_data_num_features = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses']
# Slice out the numeric block for summary statistics
diabetic_patient_data_num_features_df = diabetic_patient_data[diabetic_patient_data_num_features]
diabetic_patient_data_num_features_df.describe()
diabetic_patient_data_num_features_df.info()
# Per-feature distribution of each numeric column, split by readmission status.
# FIX: `sns.distplot` was deprecated in seaborn 0.11 and removed in 0.14;
# `histplot(..., kde=True)` is the supported replacement.
for a_num_feature in diabetic_patient_data_num_features:
    sns.FacetGrid(diabetic_patient_data, hue="readmitted", height=6).map(
        sns.histplot, a_num_feature, kde=True, stat="density").add_legend()
    plt.show()
# NOTE: this recreates the same frame as the previous cell -- kept for
# notebook-cell parity
diabetic_patient_data_num_features_df = diabetic_patient_data[diabetic_patient_data_num_features]
diabetic_patient_data_num_features_df.head()
# Create the (absolute) correlation matrix of the numeric features
corr_matrix = diabetic_patient_data_num_features_df.corr().abs()
# plotting correlations on a heatmap
plt.figure(figsize=(18, 10))
sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)
plt.show()
# Select upper triangle of correlation matrix (k=1 excludes the diagonal).
# FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24 -- use the
# builtin `bool`.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find feature columns with correlation greater than 0.60
# (the original comment said 0.80 but the code checks 0.60)
high_corr_features = [column for column in upper.columns if any(upper[column] > 0.60)]
print("HIGHLY CORRELATED FEATURES IN DATA SET:{}\n\n{}".format(len(high_corr_features), high_corr_features))
# Curated list of categorical features to one-hot encode later. Overrides the
# dtype-derived list: admission_type_id is stored as an integer but is really
# a category code. The target `readmitted` is included so it gets dummied too.
diabetic_patient_data_cat_features = [
    'admission_type_id', # NOTE cat-encoded numerical values
    # 'discharge_disposition_id', # NOTE cat-encoded numerical values
    # 'admission_source_id', # NOTE cat-encoded numerical values
    'race',
    'gender',
    'age',
    'medical_specialty',
    'max_glu_serum', # NOTE has low variance
    'A1Cresult',
    # diabetes-med-start
    'metformin',
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'acetohexamide',
    'glipizide',
    'glyburide',
    'tolbutamide',
    'pioglitazone',
    'rosiglitazone',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'examide',
    'citoglipton',
    'insulin',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
    # diabetes-med-end
    'change',
    'diabetesMed',
    'readmitted'
]
# Show cardinality and per-level frequency of every categorical feature
for a_cat_feat in diabetic_patient_data_cat_features:
    # number of distinct levels
    print(diabetic_patient_data[a_cat_feat].value_counts().count())
    # frequency of each level
    print(diabetic_patient_data[a_cat_feat].value_counts())
    print_ln()
# Standardise the numeric features (zero mean, unit variance)
from sklearn.preprocessing import scale
# FIX: the original wrapped the scaled ndarray in `pd.DataFrame(...)` before
# assigning it back. That resets the wrapper's index to 0..n-1, and because
# `drop_duplicates()` left gaps in the frame's index, the assignment aligned
# on index and silently filled non-matching rows with NaN. Assigning the raw
# ndarray is positional and keeps every row.
for a_num_feat in diabetic_patient_data_num_features:
    diabetic_patient_data[a_num_feat] = scale(diabetic_patient_data[a_num_feat])
diabetic_patient_data.head()
# One-hot encode every categorical feature (drop_first=True avoids the dummy
# trap), appending the dummies and removing the source column.
for a_cat_feat in diabetic_patient_data_cat_features:
    tmp = pd.get_dummies(diabetic_patient_data[a_cat_feat], prefix=a_cat_feat, drop_first=True)
    diabetic_patient_data = pd.concat([diabetic_patient_data, tmp], axis=1)
    # FIX: `.drop([col], 1)` passed the axis positionally -- deprecated in
    # pandas 1.1 and removed in 2.0; use the keyword form.
    diabetic_patient_data = diabetic_patient_data.drop(columns=[a_cat_feat])
diabetic_patient_data.head()
diabetic_patient_data.columns.to_list()
def clean_dataset(df):
    """Return a float64 copy of `df` with NaN and +/-inf rows removed.

    NOTE: also mutates `df` in place (dropna) -- callers in this notebook
    rely on that side effect, so it is kept.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    # FIX: `.any(1)` passed the axis positionally -- removed in pandas 2.0.
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)
# FIX: the cleaned float64 frame returned by clean_dataset was discarded --
# only the in-place dropna took effect, leaving any +/-inf rows and mixed
# dtypes in the global frame. Capture the return value.
diabetic_patient_data = clean_dataset(diabetic_patient_data)
# NOTE(review): the load cell reads from '../resources/' but this writes to
# '../_resources/' -- confirm the underscore is intentional.
diabetic_patient_data.to_csv("../_resources/diabetic_patient_data_cleansed.csv", sep=',')
# split into train and test sets (70/30, fixed seed for reproducibility)
from sklearn.model_selection import train_test_split
target_col = 'readmitted_YES'
X = diabetic_patient_data.drop(columns=[target_col])
y = diabetic_patient_data[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100)
# Baseline decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Default hyperparameters except max_depth=5, which keeps the tree small
# enough to plot and read.
dt_default = DecisionTreeClassifier(max_depth=5)
dt_default.fit(X_train, y_train)

# Evaluate the baseline model on the held-out test set
y_pred_default = dt_default.predict(X_test)
print(classification_report(y_test, y_pred_default))
print(confusion_matrix(y_test, y_pred_default))

# Per-row class probabilities on the TRAINING set; reused at the end of the
# notebook for risk stratification of the training population.
dt_pred_prob = dt_default.predict_proba(X_train)
dt_pred_prob
# Importing required packages for visualization
from IPython.display import Image
# FIX: `sklearn.externals.six` was removed from scikit-learn (0.23); the
# vendored StringIO was just io.StringIO re-exported.
from io import StringIO
from sklearn.tree import export_graphviz
import pydotplus, graphviz
# FIX: `columns[1:]` dropped the FIRST feature column and kept
# 'readmitted_YES', so every node label in the rendered tree was shifted by
# one feature. The model was trained on X's columns -- use exactly those.
features = list(X.columns)
features
# plotting the baseline tree (fitted with max_depth=5)
dot_data = StringIO()
export_graphviz(dt_default, out_file=dot_data,
                feature_names=features, filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# GridSearchCV to find optimal max_depth
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validation over a range of tree depths
n_folds = 5
parameters = {'max_depth': range(1, 40)}
dtree = DecisionTreeClassifier(criterion="gini", random_state=100)
tree = GridSearchCV(dtree, parameters, cv=n_folds,
                    return_train_score=True, scoring="accuracy")
tree.fit(X_train, y_train)

# Inspect the cross-validation results
scores = tree.cv_results_
pd.DataFrame(scores).head()

# Train vs validation accuracy as the tree grows deeper
plt.figure()
for score_key, lbl in (("mean_train_score", "training accuracy"),
                       ("mean_test_score", "test accuracy")):
    plt.plot(scores["param_max_depth"], scores[score_key], label=lbl)
plt.xlabel("max_depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()
# With large max_depth the model overfits the training data and consequently
# underperforms on the test data.
# GridSearchCV to find optimal min_samples_leaf
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validation over a range of minimum leaf sizes
n_folds = 5
parameters = {'min_samples_leaf': range(5, 200, 20)}
dtree = DecisionTreeClassifier(criterion="gini", random_state=100)
tree = GridSearchCV(dtree, parameters, cv=n_folds,
                    return_train_score=True, scoring="accuracy")
tree.fit(X_train, y_train)

# Inspect the cross-validation results
scores = tree.cv_results_
pd.DataFrame(scores).head()

# Train vs validation accuracy as the leaf-size floor grows
plt.figure()
for score_key, lbl in (("mean_train_score", "training accuracy"),
                       ("mean_test_score", "test accuracy")):
    plt.plot(scores["param_min_samples_leaf"], scores[score_key], label=lbl)
plt.xlabel("min_samples_leaf")
plt.ylabel("Accuracy")
plt.legend()
plt.show()
# Varying min_samples_leaf, we notice that the accuracy of the model
# stabilises after a value of about 150.
# GridSearchCV to find optimal min_samples_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validation over a range of minimum split sizes
n_folds = 5
parameters = {'min_samples_split': range(5, 200, 20)}
dtree = DecisionTreeClassifier(criterion="gini", random_state=100)
tree = GridSearchCV(dtree, parameters, cv=n_folds,
                    return_train_score=True, scoring="accuracy")
tree.fit(X_train, y_train)

# Inspect the cross-validation results
scores = tree.cv_results_
pd.DataFrame(scores).head()

# Train vs validation accuracy as min_samples_split grows
# (the original comment here said min_samples_leaf -- it plots min_samples_split)
plt.figure()
for score_key, lbl in (("mean_train_score", "training accuracy"),
                       ("mean_test_score", "test accuracy")):
    plt.plot(scores["param_min_samples_split"], scores[score_key], label=lbl)
plt.xlabel("min_samples_split")
plt.ylabel("Accuracy")
plt.legend()
plt.show()
# Varying min_samples_split, we notice that the accuracy of the model
# stabilises after a value of about 150.
# Create the parameter grid
# Combined grid over depth, leaf size, split size and split criterion
param_grid = {
    'max_depth': range(5, 15, 5),
    'min_samples_leaf': range(50, 200, 50),
    'min_samples_split': range(50, 200, 50),
    'criterion': ["entropy", "gini"]
}
n_folds = 5
# FIX: every other tree in this notebook pins random_state=100; without it
# the grid-search winner is not reproducible between runs.
dtree = DecisionTreeClassifier(random_state=100)
grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid,
                           cv=n_folds, verbose=1, return_train_score=True)
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results
# printing the optimal accuracy score and hyperparameters
print("best accuracy", grid_search.best_score_)
print(grid_search.best_estimator_)
# Refit a single tree with the hyperparameters chosen by the grid search
clf_gini = DecisionTreeClassifier(
    criterion="gini",
    random_state=100,
    max_depth=10,
    min_samples_leaf=50,
    min_samples_split=50,
)
clf_gini.fit(X_train, y_train)
# hold-out accuracy
clf_gini.score(X_test, y_test)
# render the fitted tree with graphviz
dot_data = StringIO()
export_graphviz(clf_gini, out_file=dot_data, feature_names=features,
                filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Shallow tree (max_depth=3): easier to read and explain to stakeholders
clf_gini = DecisionTreeClassifier(
    criterion="gini",
    random_state=100,
    max_depth=3,
    min_samples_leaf=50,
    min_samples_split=50,
)
clf_gini.fit(X_train, y_train)
# hold-out accuracy
print(clf_gini.score(X_test, y_test))
# render the shallow tree
dot_data = StringIO()
export_graphviz(clf_gini, out_file=dot_data, feature_names=features,
                filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# classification metrics on the hold-out set
from sklearn.metrics import classification_report, confusion_matrix
y_pred = clf_gini.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc

# Train a KNN classifier with 2 neighbours and evaluate it on the test set
classifier = KNeighborsClassifier(n_neighbors=2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ROC curve from the per-row probability of the positive class (column 1)
knn_pred_prob = classifier.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, knn_pred_prob[:, 1])
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve of kNN')
plt.show()
# Tune n_neighbours to obtain an optimum value
from sklearn.neighbors import KNeighborsClassifier
error = []
# Mean misclassification rate on the test set for K = 1..9
for i in range(1, 10):
    print("Currently training with : ", i)
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))
error
# Elbow plot: error rate vs K
plt.figure(figsize=(12, 6))
plt.plot(range(1, 10), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
# FIX: every other plotting cell ends with plt.show(); without it this figure
# can bleed into the next cell outside inline mode.
plt.show()
# Final KNN with the elbow-selected K=8 (kd_tree speeds up neighbour search)
classifier = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# ROC / AUC for the tuned model
from sklearn.metrics import roc_curve, auc
knn_pred_prob = classifier.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, knn_pred_prob[:, 1])
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve of kNN')
plt.show()
Given the domain and the close coordination needed with doctors and medical personnel, it is better to choose a model with higher explainability, so that the analyst as well as the medical personnel fully understand why and how the model arrived at a specific conclusion.
Use the trained model to stratify the population into 3 risk buckets:
# Predicted probabilities from the Decision Tree model on the training set.
# Columns follow clf.classes_ == [0, 1], so column 1 is P(readmitted_YES = 1).
dt_pred_prob
# FIX: the original read column 0 -- the probability of NO readmission -- as
# the risk score, so "High" actually labelled the SAFEST patients. The
# readmission risk is column 1.
stratification_array = []
for a_confidence_arr in dt_pred_prob:
    risk = a_confidence_arr[1]
    if risk >= 0.7:
        stratification_array.append('High')
    elif risk >= 0.3:
        stratification_array.append('Medium')
    else:
        stratification_array.append('Low')
# Attach the inferred risk bucket to the training rows
X_train['confidence_YES'] = stratification_array
X_train